Length-at-age data preparation and modelling

Author

Max Lindmark & Asta Audzijonyte

Published

February 4, 2026

Load packages

pkgs <- c("here", "tidyverse", "RColorBrewer", "viridis", "mgcv", "sdmTMB", "patchwork", "tidylog", "gratia", "readxl")

# minpack.lm needed if using nlsLM()
if (length(setdiff(pkgs, rownames(installed.packages()))) > 0) {
  install.packages(setdiff(pkgs, rownames(installed.packages())), dependencies = T)
}

invisible(lapply(pkgs, library, character.only = T))

# Continuous colors
options(ggplot2.continuous.colour = "viridis")

# Not on CRAN !
# devtools::install_github("seananderson/ggsidekick")
library(ggsidekick)
theme_set(theme_sleek())

# Set relative path
home <- here::here()

Read data

Read and join fish data and temperature data.

# Clean data
df <- read_excel(paste0(home, "/data/DruksiaiFish_2022march.xls")) |> dplyr::select(-"...15")

df <- df |>
  rename(
    year = Year,
    age = Age,
    area = Zone,
    species = Species,
    sex = Sex,
    month = Month,
    data_source = Data_source
  ) |>
  mutate(
    stage = NA,
    stage = ifelse(year < 1982, "a", stage),
    stage = ifelse(year >= 1982 & year <= 2003, "b", stage),
    stage = ifelse(year >= 2004, "c", stage)
  )


df2 <- df |>
  mutate(
    area = case_match(area,
      c("šalta z.", "šalta z. ") ~ "cold",
      c("šilta z.", "šilta z. ") ~ "warm",
      c("37 kv.", "4 kv.", "49 kv.") ~ "unclear",
      .default = area
    )
  )

# Add temperatures
temp_df <- read_csv(file = paste0(home, "/output/merged_temp_covar_data.csv")) |>
  filter(.width == 0.95) |> # this just because rows are duplicate for each width! doesnt matter sinse we use median instead (ignore that the column is called mean; it's calculated with median_qi from tidybayes)
  dplyr::select(mean_lst, year) |>
  rename(lst = mean_lst)

df <- left_join(df, temp_df, by = "year")

df <- df |>
  mutate(lst = ifelse(year < min(temp_df$year), mean(filter(temp_df, year < 1982)$lst), lst),
         year = as.numeric(year))

xtabs(~ species + year, data = df)
#>                             year
#> species                      1965 1978 1979 1980 1981 1982 1983 1984 1985 1986
#>   Abramis brama                 0    0    0   77  396  233  199  171  509   30
#>   Alburnus alburnus             0    0    0    0    0    0    0    0  232   37
#>   Blicca bjoerkna               0    0    0    4    0    0    0   48  131    0
#>   Carassius carassius           0    0    0    0    0    0    0    0    0    0
#>   Carassius gibelio             0    0    0    0    0    0    0    0    6    0
#>   Coregonus albula             99    0    0    0    0    0    0   40    7  135
#>   Cyprinus carpio               0    0    0    0    0    0    0    0    0    0
#>   Cyprinus cyprio               0    0    0    0    0    0    0    0    0    0
#>   Esox lucius                   0   48    0  188   86    9   20    0   37   18
#>   Gymnocephalus cernua          0    0    0    0    0    0    0    0  101    0
#>   Leuciscus idus                0    0    0    0    8    0    0    0    8    0
#>   Osmerus eperlanus             0    0    0    0    0    0    0    0  129    0
#>   Perca fluviatilis             0   24    2   97  118   57   32    0  194    0
#>   Rutilus rutilus               0   38  107  251  220  374  255  494 1014   97
#>   Sardinius erythrophthalmus    0    0    0    0    1    9    0    0   10    0
#>   Tinca tinca                   0    0    0    0    0    0    0    0    7    0
#>                             year
#> species                      1987 1988 1989 1990 1991 1992 1993 1994 1995 1996
#>   Abramis brama                 4  112   13    0  621  110    1    0    7  319
#>   Alburnus alburnus             0    0   28    0  234   43   42    0   77   77
#>   Blicca bjoerkna               0    0    0    0  196   59    0    0    5  336
#>   Carassius carassius           0    0    0    0   14    0    0    0    0    0
#>   Carassius gibelio             0    0    0    0   22    1    0    0    0    0
#>   Coregonus albula             17   10    0    0  158  103   95    0    2   72
#>   Cyprinus carpio               0    0    0    0    0    0    0    0    0    0
#>   Cyprinus cyprio               0    0    0    0    1    0    0    0    0    0
#>   Esox lucius                   2    6    1    0   92   11    4    0   20    0
#>   Gymnocephalus cernua          0    0    0    0   61   23    0   14   80   78
#>   Leuciscus idus                0    0    0    0   10    1    0    0    0    0
#>   Osmerus eperlanus             0    0    0    0    2    0    0    0    0    0
#>   Perca fluviatilis             0   21  141    0  374   70   26    0  562  574
#>   Rutilus rutilus             368  188  844   44  930  504  442   71   39  489
#>   Sardinius erythrophthalmus    0    0    0    0  157   12    0   25    6  386
#>   Tinca tinca                   0    0    0    0   49    1    0    0    0    0
#>                             year
#> species                      1997 1998 1999 2005 2006 2007 2008 2009 2010 2011
#>   Abramis brama               163  133  154   45   13   42   25   56   37   40
#>   Alburnus alburnus            97   91  135    0    0    0    0    0    0    0
#>   Blicca bjoerkna             274  232  157    9    0    0    0    0    0    0
#>   Carassius carassius           0    0    0    0    0    0    0    0    0    0
#>   Carassius gibelio             0    0    1    0    0    0    0    0    0    1
#>   Coregonus albula             64   17   50    4   20   56   23   55    8    3
#>   Cyprinus carpio               0    0    0    0    1    0    0    0    0    0
#>   Cyprinus cyprio               0    0    0    0    0    0    0    0    0    0
#>   Esox lucius                   0    1    6    1    5    4    6   13   38   25
#>   Gymnocephalus cernua         44   49   74    0    0    1    0    0    0    0
#>   Leuciscus idus                0    0    0    0    0    0    0    0    0    0
#>   Osmerus eperlanus             0    0    1    0    0    0    0    0    0    0
#>   Perca fluviatilis           293  354  252   39   30   49   80  114  167  132
#>   Rutilus rutilus             373  398  298   54   48   73   67   99  147  116
#>   Sardinius erythrophthalmus  313  227  188    0    0    2    1    0    8    6
#>   Tinca tinca                   0    2    3    3   12   16    8    8   38   27
#>                             year
#> species                      2012 2014 2015 2016 2017 2018 2019 2020 2021
#>   Abramis brama                41   29   41   54   54   31   45   33   56
#>   Alburnus alburnus            24    0    2    2    0    0    3    0    8
#>   Blicca bjoerkna               2    0    0    6    0    0    0    2    5
#>   Carassius carassius           2    0    0    2    0    0    0    0    1
#>   Carassius gibelio             1    2    0    0    1    0    0    0    1
#>   Coregonus albula             57   28   21   88   31   36   26   13   16
#>   Cyprinus carpio               0    0    0    0    0    0    0    0    0
#>   Cyprinus cyprio               0    0    0    0    0    0    0    0    0
#>   Esox lucius                  25    8   13   18    7    6   23    8    7
#>   Gymnocephalus cernua          0    0    0    0    0    0    0   12    3
#>   Leuciscus idus                0    0    0    0    0    0    0    0    0
#>   Osmerus eperlanus             0    0    8   18    0    2    1    7    4
#>   Perca fluviatilis            53   62   23   87   76   78  108   32   39
#>   Rutilus rutilus              70   53   18  108   61    6   48    0   57
#>   Sardinius erythrophthalmus   14    1    0   12    0    2    0    6   29
#>   Tinca tinca                  38   11    6   36   22   40   30    3   42

length(which(is.na(df$L == T)))
#> [1] 1673
length(which(is.na(df$l == T)))
#> [1] 283

# Rename species to common names
unique(df$species)
#>  [1] "Coregonus albula"           "Rutilus rutilus"           
#>  [3] "Esox lucius"                "Perca fluviatilis"         
#>  [5] "Abramis brama"              "Blicca bjoerkna"           
#>  [7] "Leuciscus idus"             "Sardinius erythrophthalmus"
#>  [9] "Alburnus alburnus"          "Gymnocephalus cernua"      
#> [11] "Osmerus eperlanus"          "Carassius gibelio"         
#> [13] "Tinca tinca"                "Carassius carassius"       
#> [15] "Cyprinus cyprio"            "Cyprinus carpio"

df <- df |>
  filter(species %in% c(
    "Abramis brama", "Alburnus alburnus", "Blicca bjoerkna", "Gymnocephalus cernua",
    "Gymnocephalus cernua", "Sardinius erythrophthalmus", "Tinca tinca",
    "Osmerus eperlanus", "Perca fluviatilis", "Coregonus albula",
    "Esox lucius", "Rutilus rutilus"
  )) |>
  mutate(
    species = fct_recode(species,
      "Bream" = "Abramis brama",
      "Bleak" = "Alburnus alburnus",
      "Silver bream" = "Blicca bjoerkna",
      "Ruffe" = "Gymnocephalus cernua",
      "Rudde" = "Sardinius erythrophthalmus",
      "Tench" = "Tinca tinca",
      "Smelt" = "Osmerus eperlanus",
      "Perch" = "Perca fluviatilis",
      "Vendace" = "Coregonus albula",
      "Pike" = "Esox lucius",
      "Roach" = "Rutilus rutilus"
    )
  ) |>
  drop_na(l)

# Filter ages
df |>
  filter(species == "Roach") |>
  ggplot(aes(l)) +
  geom_histogram() +
  facet_wrap(~year, scales = "free_y")


# Some odd observations in roach - likely to be L and not l. For now we remove them
df <- df |>
  filter(
    !(species == "Roach" & month == 5 & year == "1980"),
    !(species == "Roach" & month == 8 & year == "1980" & age < 7),
    !(species == "Roach" & age == 4 & l < 2)
  )

# Explore length vs age
ggplot(df, aes(age, l, color = data_source)) +
  facet_wrap(~species, scales = "free", ncol = 5) +
  geom_point(alpha = 0.2) +
  theme(legend.position = "bottom")


# Some data entry errors in bream as well, filter these! (a 1 yr old bream is not 37 cm!)
df <- df |>
  mutate(keep = ifelse(species == "Bream" & age < 2 & l > 30, "N", "Y")) |>
  filter(keep == "Y") |>
  dplyr::select(-keep)

# And in vendece...
df <- df |>
  mutate(keep = ifelse(species == "Vendace" & age >= 3 & l < 12, "N", "Y")) |>
  filter(keep == "Y") |>
  dplyr::select(-keep)

# Replace NA with new level
df <- df |> mutate(data_source = replace_na(data_source, "missing_data_source"))

To get covariates that represent the average temperatures during the lifetime, we’ll here add lagged variables. This code can certainly be written in a better way but it does the job …

# Add various lagged temperature metrics (average lifetime temperature etc). We don't have temperatures for the oldest cohorts, so we give them the average of the stage a period

df <- df |> mutate(cohort = year - age)
#> mutate: new variable 'cohort' (double) with 62 unique values and <1% NA
min(df$cohort, na.rm = TRUE)
#> [1] 1961
min(temp_df$year)
#> [1] 1978
min(temp_df$year) - min(df$cohort, na.rm = TRUE)
#> [1] 17

# Need to extend it 17 years
years_to_add <- seq(from = min(df$cohort, na.rm = TRUE), to = min(temp_df$year - 1))

temp_df_temp <- data.frame(
  year = years_to_add,
  lst = rep(mean(filter(temp_df, year < 1982)$lst), length(years_to_add)),
  stage = rep("a", length(years_to_add))
)
#> filter: removed 41 rows (91%), 4 rows remaining

temp_df2 <- bind_rows(temp_df_temp, temp_df) |>
  mutate(
    stage = NA,
    stage = ifelse(year < 1982, "a", stage),
    stage = ifelse(year >= 1982 & year <= 2003, "b", stage),
    stage = ifelse(year >= 2004, "c", stage)
  ) |>
  arrange(year)
#> mutate: changed 45 values (73%) of 'stage' (45 fewer NA)

# Ok, now, based on the cohort and age, we need to average certain columns and match to the length-at-age data...
# Lag variables equal to the oldest fish in the data
temp_df2 <- temp_df2 |>
  mutate(
    lst_t_minus_1 = lag(lst, n = 1),
    lst_t_minus_2 = lag(lst, n = 2),
    lst_t_minus_3 = lag(lst, n = 3),
    lst_t_minus_4 = lag(lst, n = 4),
    lst_t_minus_5 = lag(lst, n = 5),
    lst_t_minus_6 = lag(lst, n = 6),
    lst_t_minus_7 = lag(lst, n = 7),
    lst_t_minus_8 = lag(lst, n = 8),
    lst_t_minus_9 = lag(lst, n = 9),
    lst_t_minus_10 = lag(lst, n = 10),
    lst_t_minus_11 = lag(lst, n = 11),
    lst_t_minus_12 = lag(lst, n = 12),
    lst_t_minus_13 = lag(lst, n = 13),
    lst_t_minus_14 = lag(lst, n = 14),
    lst_t_minus_15 = lag(lst, n = 15),
    lst_t_minus_16 = lag(lst, n = 16),
    lst_t_minus_17 = lag(lst, n = 17),
    lst_t_minus_18 = lag(lst, n = 18),
    lst_t_minus_19 = lag(lst, n = 19),
    lst_t_minus_20 = lag(lst, n = 20),
    lst_t_minus_21 = lag(lst, n = 21)
  )
#> mutate: new variable 'lst_t_minus_1' (double) with 46 unique values and 2% NA
#>         new variable 'lst_t_minus_2' (double) with 45 unique values and 3% NA
#>         new variable 'lst_t_minus_3' (double) with 44 unique values and 5% NA
#>         new variable 'lst_t_minus_4' (double) with 43 unique values and 6% NA
#>         new variable 'lst_t_minus_5' (double) with 42 unique values and 8% NA
#>         new variable 'lst_t_minus_6' (double) with 41 unique values and 10% NA
#>         new variable 'lst_t_minus_7' (double) with 40 unique values and 11% NA
#>         new variable 'lst_t_minus_8' (double) with 39 unique values and 13% NA
#>         new variable 'lst_t_minus_9' (double) with 38 unique values and 15% NA
#>         new variable 'lst_t_minus_10' (double) with 37 unique values and 16% NA
#>         new variable 'lst_t_minus_11' (double) with 36 unique values and 18% NA
#>         new variable 'lst_t_minus_12' (double) with 35 unique values and 19% NA
#>         new variable 'lst_t_minus_13' (double) with 34 unique values and 21% NA
#>         new variable 'lst_t_minus_14' (double) with 33 unique values and 23% NA
#>         new variable 'lst_t_minus_15' (double) with 32 unique values and 24% NA
#>         new variable 'lst_t_minus_16' (double) with 31 unique values and 26% NA
#>         new variable 'lst_t_minus_17' (double) with 30 unique values and 27% NA
#>         new variable 'lst_t_minus_18' (double) with 29 unique values and 29% NA
#>         new variable 'lst_t_minus_19' (double) with 28 unique values and 31% NA
#>         new variable 'lst_t_minus_20' (double) with 27 unique values and 32% NA
#>         new variable 'lst_t_minus_21' (double) with 26 unique values and 34% NA

# Add in the same years that I extended the temperature data with in the length so that I can do a left_join to get in the lagged variables
years_to_add2 <- temp_df2 |>
  dplyr::select(year) |>
  filter(year < min(df$year))
#> filter: removed 45 rows (73%), 17 rows remaining

df2 <- bind_rows(years_to_add2, df)

# Join in the lagged temperature variables to the length data. Before I do that though, it will be much easier indexing further down if temp (lst) is the last column...
temp_col <- df2 |> dplyr::select(lst)

df2 <- df2 |> dplyr::select(-lst)

df2$lst <- temp_col$lst

colnames(df2)
#>  [1] "year"        "Date"        "month"       "Day"         "species"    
#>  [6] "code"        "L"           "l"           "Q"           "age"        
#> [11] "sex"         "Scales"      "area"        "data_source" "stage"      
#> [16] "cohort"      "lst"

# Now join
df3 <- left_join(df2, temp_df2)
#> Joining with `by = join_by(year, stage, lst)`
#> left_join: added 21 columns (lst_t_minus_1, lst_t_minus_2, lst_t_minus_3, lst_t_minus_4, lst_t_minus_5, …)
#>            > rows only in x       17
#>            > rows only in y  (    24)
#>            > matched rows     24,043
#>            >                 ========
#>            > rows total       24,060

# Check an example: age 5 fish in 1991
df3 |>
  filter(age == 5 & year == 1991) |>
  dplyr::select(
    year, cohort, age, l, species, lst, lst_t_minus_1, lst_t_minus_2, lst_t_minus_3,
    lst_t_minus_4, lst_t_minus_5, lst_t_minus_6
  ) |>
  arrange(year) |>
  head(1)
#> filter: removed 23,486 rows (98%), 574 rows remaining
#>   year cohort age  l species      lst lst_t_minus_1 lst_t_minus_2 lst_t_minus_3
#> 1 1991   1986   5 17   Bream 18.56636      19.91319      20.51592      20.29426
#>   lst_t_minus_4 lst_t_minus_5 lst_t_minus_6
#> 1      16.20713      19.20568      18.40932

temp_df |>
  filter(year < 1992 & year > 1985) |>
  as.data.frame() |>
  arrange(desc(year))
#> filter: removed 39 rows (87%), 6 rows remaining
#>        lst year
#> 1 18.56636 1991
#> 2 19.91319 1990
#> 3 20.51592 1989
#> 4 20.29426 1988
#> 5 16.20713 1987
#> 6 19.20568 1986

# Check an example: age 10 fish in 1980
min(temp_df$year)
#> [1] 1978
df3 |>
  filter(age == 10 & year == 1980) |>
  dplyr::select(
    year, cohort, age, l, species, lst, lst_t_minus_1, lst_t_minus_2, lst_t_minus_3,
    lst_t_minus_4, lst_t_minus_5, lst_t_minus_6
  ) |>
  arrange(year) |>
  head(1)
#> filter: removed 24,017 rows (>99%), 43 rows remaining
#>   year cohort age  l species      lst lst_t_minus_1 lst_t_minus_2 lst_t_minus_3
#> 1 1980   1970  10 31   Perch 16.19834       17.3763      17.47408      17.09762
#>   lst_t_minus_4 lst_t_minus_5 lst_t_minus_6
#> 1      17.09762      17.09762      17.09762

temp_df |>
  filter(year < 1981 & year > 1969) |>
  as.data.frame() |>
  arrange(desc(year))
#> filter: removed 42 rows (93%), 3 rows remaining
#>        lst year
#> 1 16.19834 1980
#> 2 17.37630 1979
#> 3 17.47408 1978
temp_df2 |>
  filter(year < 1981 & year > 1969) |>
  as.data.frame() |>
  dplyr::select(lst, year) |>
  arrange(desc(year))
#> filter: removed 51 rows (82%), 11 rows remaining
#>         lst year
#> 1  16.19834 1980
#> 2  17.37630 1979
#> 3  17.47408 1978
#> 4  17.09762 1977
#> 5  17.09762 1976
#> 6  17.09762 1975
#> 7  17.09762 1974
#> 8  17.09762 1973
#> 9  17.09762 1972
#> 10 17.09762 1971
#> 11 17.09762 1970

df3 |>
  filter(year %in% c(1986, 1987, 1988, 1989, 1990, 1991)) |>
  dplyr::select(
    year, cohort, age, lst, lst_t_minus_1, lst_t_minus_2, lst_t_minus_3,
    lst_t_minus_4, lst_t_minus_5, lst_t_minus_6
  ) |>
  arrange(year) |>
  distinct(year, .keep_all = TRUE)
#> filter: removed 19,177 rows (80%), 4,883 rows remaining
#> distinct: removed 4,877 rows (>99%), 6 rows remaining
#>   year cohort age      lst lst_t_minus_1 lst_t_minus_2 lst_t_minus_3
#> 1 1986   1966  20 19.20568      18.40932      17.63245      17.62352
#> 2 1987   1976  11 16.20713      19.20568      18.40932      17.63245
#> 3 1988   1981   7 20.29426      16.20713      19.20568      18.40932
#> 4 1989   1974  15 20.51592      20.29426      16.20713      19.20568
#> 5 1990   1984   6 19.91319      20.51592      20.29426      16.20713
#> 6 1991   1991   0 18.56636      19.91319      20.51592      20.29426
#>   lst_t_minus_4 lst_t_minus_5 lst_t_minus_6
#> 1      17.46301      17.34176      16.19834
#> 2      17.62352      17.46301      17.34176
#> 3      17.63245      17.62352      17.46301
#> 4      18.40932      17.63245      17.62352
#> 5      19.20568      18.40932      17.63245
#> 6      16.20713      19.20568      18.40932

# All good. What remains now is to take the average across the columns of lagged temperatures. The number of columns to calculate on depends on the age.

df4 <- df3 |> drop_na(age)
#> drop_na: removed 55 rows (<1%), 24,005 rows remaining

# Fill in columns with NA before looping
df4$lifetime_avg_temp <- NA
df4$birth_temp <- NA
df4$first_two_yrs_temp <- NA

str(df4)
#> 'data.frame':    24005 obs. of  41 variables:
#>  $ year              : num  1978 1978 1978 1978 1978 ...
#>  $ Date              : chr  "1978.06.01" "1978.06.01" "1978.06.01" "1978.06.01" ...
#>  $ month             : num  6 6 6 6 6 6 6 6 6 6 ...
#>  $ Day               : num  NA NA NA NA NA NA NA NA NA NA ...
#>  $ species           : Factor w/ 11 levels "Bream","Bleak",..: 9 9 9 9 9 5 5 9 9 5 ...
#>  $ code              : chr  "34/14" "35/15" "36/16" "37/17" ...
#>  $ L                 : chr  NA NA NA NA ...
#>  $ l                 : num  17 16 15 18 18 32 36 18 17 42 ...
#>  $ Q                 : num  100 85 65 125 110 310 640 115 115 845 ...
#>  $ age               : num  7 6 6 8 7 4 5 8 8 5 ...
#>  $ sex               : chr  "9" "9" "1" "9" ...
#>  $ Scales            : chr  NA NA NA NA ...
#>  $ area              : chr  "?" "?" "?" "?" ...
#>  $ data_source       : chr  "D5" "D5" "D5" "D5" ...
#>  $ stage             : chr  "a" "a" "a" "a" ...
#>  $ cohort            : num  1971 1972 1972 1970 1971 ...
#>  $ lst               : num  17.5 17.5 17.5 17.5 17.5 ...
#>  $ lst_t_minus_1     : num  17.1 17.1 17.1 17.1 17.1 ...
#>  $ lst_t_minus_2     : num  17.1 17.1 17.1 17.1 17.1 ...
#>  $ lst_t_minus_3     : num  17.1 17.1 17.1 17.1 17.1 ...
#>  $ lst_t_minus_4     : num  17.1 17.1 17.1 17.1 17.1 ...
#>  $ lst_t_minus_5     : num  17.1 17.1 17.1 17.1 17.1 ...
#>  $ lst_t_minus_6     : num  17.1 17.1 17.1 17.1 17.1 ...
#>  $ lst_t_minus_7     : num  17.1 17.1 17.1 17.1 17.1 ...
#>  $ lst_t_minus_8     : num  17.1 17.1 17.1 17.1 17.1 ...
#>  $ lst_t_minus_9     : num  17.1 17.1 17.1 17.1 17.1 ...
#>  $ lst_t_minus_10    : num  17.1 17.1 17.1 17.1 17.1 ...
#>  $ lst_t_minus_11    : num  17.1 17.1 17.1 17.1 17.1 ...
#>  $ lst_t_minus_12    : num  17.1 17.1 17.1 17.1 17.1 ...
#>  $ lst_t_minus_13    : num  17.1 17.1 17.1 17.1 17.1 ...
#>  $ lst_t_minus_14    : num  17.1 17.1 17.1 17.1 17.1 ...
#>  $ lst_t_minus_15    : num  17.1 17.1 17.1 17.1 17.1 ...
#>  $ lst_t_minus_16    : num  17.1 17.1 17.1 17.1 17.1 ...
#>  $ lst_t_minus_17    : num  17.1 17.1 17.1 17.1 17.1 ...
#>  $ lst_t_minus_18    : num  NA NA NA NA NA NA NA NA NA NA ...
#>  $ lst_t_minus_19    : num  NA NA NA NA NA NA NA NA NA NA ...
#>  $ lst_t_minus_20    : num  NA NA NA NA NA NA NA NA NA NA ...
#>  $ lst_t_minus_21    : num  NA NA NA NA NA NA NA NA NA NA ...
#>  $ lifetime_avg_temp : logi  NA NA NA NA NA NA ...
#>  $ birth_temp        : logi  NA NA NA NA NA NA ...
#>  $ first_two_yrs_temp: logi  NA NA NA NA NA NA ...

# Loop through all rows and calculate the average lifetime temperature by averaging across as many columns the fish is old
for (i in 1:nrow(df4)) {
  # temperatures start with temp on col #17; colnames(df4); df4 |> select(17) |> head()
  # Lifetime
  df4$lifetime_avg_temp[i] <- ifelse(df4$age[i] == 0,
                                     df4[i, 17],
                                     rowMeans(df4[i, 17:(17 + (df4$age[i]))])
  )

  # Birth temp
  df4$birth_temp[i] <-
    df4[i, (17 + (df4$age[i]))]
}

# Check it worked by plotting the temperatures in the growth data against manual calculations
# Here, if age == 0, lst and birth temp and average lifetime birth temp should be the same
df4 |>
  filter(age == 0) |>
  ggplot(aes(lst, lifetime_avg_temp)) +
  geom_point() +
  geom_abline()
#> filter: removed 23,779 rows (99%), 226 rows remaining


df4 |>
  filter(age == 0) |>
  ggplot(aes(lst, birth_temp)) +
  geom_point() +
  geom_abline()
#> filter: removed 23,779 rows (99%), 226 rows remaining


# Looks alright for age zeroes...

# For age 1 fish, Test age one lifetime avg
df4 |>
  filter(age == 1) |>
  mutate(lifetime_avg_temp2 = (lst + lst_t_minus_1) / 2) |> # manually calculate the average of current temperature and lagged 1 year temperature
  ggplot(aes(lifetime_avg_temp2, lifetime_avg_temp)) +
  geom_point() +
  geom_abline()
#> filter: removed 23,410 rows (98%), 595 rows remaining
#> mutate: new variable 'lifetime_avg_temp2' (double) with 24 unique values and 0% NA


# Age 2
df4 |>
  filter(age == 2) |>
  mutate(lifetime_avg_temp2 = (lst + lst_t_minus_1 + lst_t_minus_2) / 3) |> # manually calculate the average of current temperature and lagged 1 year and lagged 2 year temperatures
  ggplot(aes(lifetime_avg_temp2, lifetime_avg_temp)) +
  geom_point() +
  geom_abline()
#> filter: removed 22,768 rows (95%), 1,237 rows remaining
#> mutate: new variable 'lifetime_avg_temp2' (double) with 35 unique values and 0% NA


# Age 3 birth temp
df4 |>
  filter(age == 3) |>
  mutate(birth_temp2 = lst_t_minus_3) |>
  ggplot(aes(birth_temp2, birth_temp)) +
  geom_point() +
  geom_abline()
#> filter: removed 21,796 rows (91%), 2,209 rows remaining
#> mutate: new variable 'birth_temp2' (double) with 34 unique values and 0% NA


# Another test... all cohorts before 1978 should have the same temperature
df4 |>
  filter(cohort < 1978) |>
  distinct(birth_temp)
#> filter: removed 19,435 rows (81%), 4,570 rows remaining
#> distinct: removed 4,569 rows (>99%), one row remaining
#>   birth_temp
#> 1   17.09762

tt <- df4 |> filter(is.na(birth_temp))
#> filter: removed all rows (100%)
tt <- df4 |> filter(is.na(lifetime_avg_temp))
#> filter: removed all rows (100%)

Explore data

# if starting from here load df4
# load(file = paste0(home, "/data/druksiai_beforeFiltering.RData"))

# Sample size
df4 |>
  group_by(year, species, age) |>
  summarise(n = n()) |>
  ggplot(aes(year, n, fill = factor(age))) +
  geom_bar(stat = "identity") +
  facet_wrap(~species, scales = "free_y")
#> group_by: 3 grouping variables (year, species, age)
#> summarise: now 1,819 rows and 4 columns, 2 group variables remaining (year, species)


# Histogram of lengths
ggplot(df4, aes(l)) +
  geom_histogram() +
  facet_wrap(~species, scales = "free")
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Trim data

# Trim data somewhat, and include datasource as a random effect
df5 <- df4 |>
  ungroup() |>
  group_by(species, age) |>
  mutate(
    sd_l = sd(l, na.rm = TRUE),
    mean_l = mean(l, na.rm = TRUE)
  ) |>
  filter(is.finite(sd_l)) |>
  ungroup() |>
  mutate(
    keep = ifelse(l >= mean_l + 4 * sd_l, "N", "Y"),
    keep = ifelse(l <= mean_l - 4 * sd_l, "N", keep)
  )
#> ungroup: no grouping variables
#> group_by: 2 grouping variables (species, age)
#> mutate (grouped): new variable 'sd_l' (double) with 138 unique values and <1% NA
#>                   new variable 'mean_l' (double) with 148 unique values and 0% NA
#> filter (grouped): removed 9 rows (<1%), 23,996 rows remaining
#> ungroup: no grouping variables
#> mutate: new variable 'keep' (character) with 2 unique values and 0% NA

df5 <- df5 |> 
    tidylog::filter(!(species == "Silver bream" & l < 5 & age > 10))
#> filter: removed one row (<1%), 23,995 rows remaining

ggplot(df5, aes(age, l, color = keep)) +
  geom_jitter(alpha = 3/4) +
  facet_wrap(~species, scales = "free", ncol = 3)


df5 <- df5 |>
  mutate(
    data_source2 = case_when(
      data_source == "žvynų knygutės" ~ "scales",
      data_source == "missing_data_source" ~ "missing_data_source",
      data_source %in% c("seni kompų hardai", "senų kompų hardai") ~ "digitised_old_data",
      TRUE ~ "journal"
    )
  )
#> mutate: new variable 'data_source2' (character) with 4 unique values and 0% NA

ggplot(df5, aes(year, fill = data_source2)) +
  geom_bar(stat = "count") +
  facet_wrap(~species)


# Fix some variables
df5 <- df5 |>
  drop_na(age) |>
  drop_na(lifetime_avg_temp) |>
  drop_na(month) |>
  drop_na(l) |>
  dplyr::select(-data_source) |>
  rename(data_source = data_source2) |>
  mutate(
    f_month = as.factor(month),
    f_age = as.factor(age),
    f_data_source = as.factor(data_source)
  )
#> drop_na: no rows removed
#> drop_na: no rows removed
#> drop_na: removed 4,491 rows (19%), 19,504 rows remaining
#> drop_na: no rows removed
#> rename: renamed one variable (data_source)
#> mutate: new variable 'f_month' (factor) with 12 unique values and 0% NA
#>         new variable 'f_age' (factor) with 22 unique values and 0% NA
#>         new variable 'f_data_source' (factor) with 4 unique values and 0% NA

# vendace - 2-3 years;
# bream - 5-7 years;
# perch - 2-3 years;
# roach - 3-5 years;
# pike - 3 years.
# Bleak (Alburnus) – 2-3 years;
# Silverbream – 3-4 years
# Ruffe – 2 years
# Smelt – 2 years
# Rudde (Scardinius) – 3-4 years
# Tench (Tinca) – 3-5 years

# Plot age distributions to figure out which constitutes "old" fish

# Add maturation age by species
df5 <- df5 |>
  mutate(
    life_stage = "Adult",
    life_stage = ifelse(species == "Vendace" & age %in% c(2, 3), "Maturation age", life_stage),
    life_stage = ifelse(species == "Vendace" & age < 2, "Juvenile", life_stage),
    life_stage = ifelse(species == "Bream" & age %in% c(5, 6, 7), "Maturation age", life_stage),
    life_stage = ifelse(species == "Bream" & age < 5, "Juvenile", life_stage),
    life_stage = ifelse(species == "Perch" & age %in% c(2, 3), "Maturation age", life_stage),
    life_stage = ifelse(species == "Perch" & age < 2, "Juvenile", life_stage),
    life_stage = ifelse(species == "Roach" & age %in% c(3, 4, 5), "Maturation age", life_stage),
    life_stage = ifelse(species == "Roach" & age < 3, "Juvenile", life_stage),
    life_stage = ifelse(species == "Pike" & age %in% c(3), "Maturation age", life_stage),
    life_stage = ifelse(species == "Pike" & age < 3, "Juvenile", life_stage),
    life_stage = ifelse(species == "Bleak" & age %in% c(2, 3), "Maturation age", life_stage),
    life_stage = ifelse(species == "Bleak" & age < 2, "Juvenile", life_stage),
    life_stage = ifelse(species == "Silver bream" & age %in% c(3, 4), "Maturation age", life_stage),
    life_stage = ifelse(species == "Silver bream" & age < 3, "Juvenile", life_stage),
    life_stage = ifelse(species == "Ruffe" & age %in% c(2), "Maturation age", life_stage),
    life_stage = ifelse(species == "Ruffe" & age < 2, "Juvenile", life_stage),
    life_stage = ifelse(species == "Smelt" & age %in% c(2), "Maturation age", life_stage),
    life_stage = ifelse(species == "Smelt" & age < 2, "Juvenile", life_stage),
    life_stage = ifelse(species == "Rudde" & age %in% c(3, 4), "Maturation age", life_stage),
    life_stage = ifelse(species == "Rudde" & age < 3, "Juvenile", life_stage),
    life_stage = ifelse(species == "Tench" & age %in% c(3, 4, 5), "Maturation age", life_stage),
    life_stage = ifelse(species == "Tench" & age < 3, "Juvenile", life_stage)
  ) |>
  mutate(break_age = max(age) / 2) |>
  mutate(life_stage = ifelse(age > break_age & life_stage == "Adult", "Old adult", life_stage))
#> mutate: new variable 'life_stage' (character) with 3 unique values and 0% NA
#> mutate: new variable 'break_age' (double) with one unique value and 0% NA
#> mutate: changed 3,303 values (17%) of 'life_stage' (0 new NA)

ggplot(df5, aes(age, fill = life_stage)) +
  geom_histogram() +
  facet_wrap(~species, scales = "free")
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Final exploration:
# yearly data distribution by species. Note, smelt has too few years and cohorts (short lived), so we will remove it
df5 |>
  group_by(species) |>
  distinct(cohort, .keep_all = TRUE) |>
  ggplot(aes(cohort, species)) +
  geom_point()
#> group_by: one grouping variable (species)
#> distinct (grouped): removed 19,066 rows (98%), 438 rows remaining


# smelt has only 12 cohorts, other species >20
df5 |>
  group_by(species) |>
  summarise(n_cohorts = n_distinct(cohort)) |>
  arrange(n_cohorts, desc(n_cohorts))
#> group_by: one grouping variable (species)
#> summarise: now 11 rows and 2 columns, ungrouped
#> # A tibble: 11 × 2
#>    species      n_cohorts
#>    <fct>            <int>
#>  1 Smelt               12
#>  2 Ruffe               24
#>  3 Bleak               30
#>  4 Vendace             37
#>  5 Tench               39
#>  6 Silver bream        41
#>  7 Rudde               41
#>  8 Pike                46
#>  9 Bream               54
#> 10 Roach               56
#> 11 Perch               58
df6 <- df5 |>
  filter(!species == "Smelt") |> 
  filter(keep == "Y") |> 
  dplyr::select(
    year, species, l, age, f_age, f_month, stage, cohort, lst,
    lifetime_avg_temp, birth_temp, data_source, f_data_source, life_stage
  )
#> filter: removed 171 rows (1%), 19,333 rows remaining
#> filter: removed 135 rows (1%), 19,198 rows remaining

save(df6, file = paste0(home, "/data/druksiai2.RData"))